Load Packages
library(tidyverse)
library(cowplot)
library(ggthemes)
library(splatter)
library(scater)
library(scran)
library(scRNAseq)
theme_set(theme_few())
all_metadata <- read_csv("scRNA_coldata.csv")
batch <- all_metadata[!is.na(str_extract(all_metadata$vars, regex("batch", ignore_case = T))), ]
tech <- all_metadata[!is.na(str_extract(all_metadata$vars, regex("tech", ignore_case = T))), ]
bind_rows(batch, tech) %>% distinct(data)
## # A tibble: 10 × 1
## data
## <chr>
## 1 AztekinTailData()
## 2 BacherTCellData()
## 3 CampbellBrainData()
## 4 GiladiHSCData(mode='rna')
## 5 LedergorMyelomaData()
## 6 KotliarovPBMCData()
## 7 MessmerESCData()
## 8 PaulHSCData()
## 9 ZilionisLungData('mouse')
## 10 WuKidneyData()
Aztekin Tail Data
AztekinTailData <- AztekinTailData()
AztekinTailData
## class: SingleCellExperiment
## dim: 31535 13199
## metadata(0):
## assays(1): counts
## rownames(31535): Xelaev18000001m.g Xelaev18000003m.g ... loc398467.S
## Xetrov90026938m.S
## rowData names(0):
## colnames(13199): AAACCTGAGCTAGTTC.1 AAACCTGGTGGGTCAA.1 ...
## TTGTAGGCAGTACACT.1 TTTGCGCAGCGTGAAC.1.1
## colData names(9): cell sample ... Condition batch
## reducedDimNames(1): UMAP
## mainExpName: NULL
## altExpNames(0):
table(Batch = colData(AztekinTailData)$batch)
## Batch
## 1 2 3 4
## 1552 3277 2354 6016
table(Cluster = colData(AztekinTailData)$cluster)
## Cluster
## Alpha ionocyte Anterior notochord
## 56 34
## Beta ionocyte Dermomyotome
## 207 84
## Differentiating myocyte Differentiating neuron
## 12 158
## Dopaminergic neurons Epidermis
## 80 1800
## Erythrocyte 1 Erythrocyte 2
## 2761 132
## Erythrocyte 3 Erythrocyte 4
## 982 862
## Floor plate Goblet cell
## 109 1754
## Interneuron 1 Interneuron 2
## 128 73
## Interneuron 3 Interneuron 4
## 24 23
## laminin-rich epidermis Lymphoid 1 (Gata2-, Gata3+)
## 21 56
## Lymphoid 2 (Cxcr6+) Lymphoid 3
## 65 31
## Lymphoid 4 (Gata2+, Gata3-) Lymphoid 5 (CD19+)
## 77 14
## Lymphoid endothelial cells Melanocyte
## 12 25
## Melanocyte precursor Melanocyte stem cell
## 21 37
## Mesenchyme Motor neuron
## 215 76
## Motor neuron (leptin+) Myeloid 1
## 71 475
## Myeloid 2 Myotome
## 265 303
## Oligodendrocyte Posterior notochord
## 8 54
## ROCs Satellite cell
## 254 44
## Sclerotome Skeletal muscle
## 752 117
## Small secretory cell Smooth muscle
## 37 11
## Spinal cord progenitor Syndetome
## 505 15
## Vascular endothelial cell Vulnerable Motor Neuron
## 75 284
AztekinTailData <- logNormCounts(AztekinTailData)
# Subset to top 10% HVG
chosen_features <-
getTopHVGs(modelGeneVar(AztekinTailData, AztekinTailData$batch),
n = ceiling(nrow(AztekinTailData) / 10))
AztekinTailData <- AztekinTailData[chosen_features, ]
# Subset to two smallest batches
AztekinTailData <- AztekinTailData[, colData(AztekinTailData)$batch %in% c(1, 3)]
# Create default named factors for batches and cell clusters
colData(AztekinTailData)$Batch <- factor(colData(AztekinTailData)$batch)
colData(AztekinTailData)$Cluster <- factor(colData(AztekinTailData)$cluster)
AztekinTailData <- runPCA(AztekinTailData)
plotPCA(AztekinTailData, color_by = "Batch")

plotPCA(AztekinTailData, color_by = "Cluster", add_legend = F)

AztekinTailData <- runTSNE(AztekinTailData)
plotTSNE(AztekinTailData, color_by = "Batch")

plotTSNE(AztekinTailData, color_by = "Cluster", add_legend = F)

AztekinTailData <- runUMAP(AztekinTailData)
plotUMAP(AztekinTailData, color_by = "Batch")

plotUMAP(AztekinTailData, color_by = "Cluster", add_legend = F)

Bacher T-Cell Data
BacherTCellData <- BacherTCellData()
BacherTCellData
## class: SingleCellExperiment
## dim: 33538 104417
## metadata(0):
## assays(1): counts
## rownames(33538): MIR1302-2HG FAM138A ... AC213203.1 FAM231C
## rowData names(0):
## colnames(104417): AAACCTGAGAGAACAG-J09835 AAACCTGAGTGCGTGA-J09835 ...
## TTTGGTTAGTGTTTGC-J21856 TTTGGTTTCCAACCAA-J21856
## colData names(22): orig.ident nCount_RNA ... who_class severity
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
table(Batch = colData(BacherTCellData)$batch)
## Batch
## 1 2 3 4 5 10 11 12 13 14 15 16 17
## 2547 11585 13344 12904 3059 587 7740 9961 4623 754 1113 376 296
## 18 19
## 17915 17613
table(Cluster = colData(BacherTCellData)$new_cluster_names)
## Cluster
## Central memory Cycling Cytotoxic / Th1
## 18391 606 14262
## Tfh-like Transitional memory Type-1 IFN signature
## 45732 24165 1261
BacherTCellData <- logNormCounts(BacherTCellData)
# Subset to top 10% HVG
chosen_features <-
getTopHVGs(modelGeneVar(BacherTCellData, BacherTCellData$batch),
n = ceiling(nrow(BacherTCellData) / 10))
BacherTCellData <- BacherTCellData[chosen_features, ]
# Subset to three smallest batches
BacherTCellData <- BacherTCellData[, colData(BacherTCellData)$batch %in% c(10, 16, 17)]
# Create default named factors for batches and cell clusters
colData(BacherTCellData)$Batch <- factor(colData(BacherTCellData)$batch)
colData(BacherTCellData)$Cluster <- factor(colData(BacherTCellData)$new_cluster_names)
BacherTCellData <- runPCA(BacherTCellData)
plotPCA(BacherTCellData, color_by = "Batch")

plotPCA(BacherTCellData, color_by = "Cluster")

BacherTCellData <- runTSNE(BacherTCellData)
plotTSNE(BacherTCellData, color_by = "Batch")

plotTSNE(BacherTCellData, color_by = "Cluster")

BacherTCellData <- runUMAP(BacherTCellData)
plotUMAP(BacherTCellData, color_by = "Batch")

plotUMAP(BacherTCellData, color_by = "Cluster")

Campbell Brain Data
CampbellBrainData <- CampbellBrainData()
CampbellBrainData
## class: SingleCellExperiment
## dim: 26774 21086
## metadata(0):
## assays(1): counts
## rownames(26774): 0610005C13Rik 0610007P14Rik ... Tfpi2 Trex2
## rowData names(0):
## colnames(21086): arc1_TACTAACAGTAN arc1_CCGCGAGCTCTT ...
## MaleFed_TGACGCGTTCTT MaleFed_GGGGCTTATTGN
## colData names(11): ID group ... clust_neurons Sex_pred
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
table(Batch = colData(CampbellBrainData)$batches)
## Batch
## b1 b2 b3 b4 b5 b6
## 2607 2463 2288 6504 5277 1947
table(Cluster = colData(CampbellBrainData)$clust_all)
## Cluster
## a01 a02 a03 a04 a05 a06 a07 a08 a09 a10 a11 a12 a13
## 392 131 240 84 224 627 330 172 467 99 1184 3504 37
## a14 a15 a16 a17 a18 a19 a20 miss
## 502 693 533 799 10515 150 238 165
CampbellBrainData <- logNormCounts(CampbellBrainData)
# Subset to top 10% HVG
chosen_features <-
getTopHVGs(modelGeneVar(CampbellBrainData, CampbellBrainData$batches),
n = ceiling(nrow(CampbellBrainData) / 10))
CampbellBrainData <- CampbellBrainData[chosen_features, ]
# Subset to two smallest batches
CampbellBrainData <- CampbellBrainData[, colData(CampbellBrainData)$batches %in% c("b3", "b6")]
# Create default named factors for batches and cell clusters
colData(CampbellBrainData)$Batch <- factor(colData(CampbellBrainData)$batches)
colData(CampbellBrainData)$Cluster <- factor(colData(CampbellBrainData)$clust_all)
CampbellBrainData <- runPCA(CampbellBrainData)
plotPCA(CampbellBrainData, color_by = "Batch")

plotPCA(CampbellBrainData, color_by = "Cluster", add_legend = F)

CampbellBrainData <- runTSNE(CampbellBrainData)
plotTSNE(CampbellBrainData, color_by = "Batch")

plotTSNE(CampbellBrainData, color_by = "Cluster", add_legend = F)

CampbellBrainData <- runUMAP(CampbellBrainData)
plotUMAP(CampbellBrainData, color_by = "Batch")

plotUMAP(CampbellBrainData, color_by = "Cluster", add_legend = F)

Messmer ESC Data
MessmerESCData <- MessmerESCData()
MessmerESCData
## class: SingleCellExperiment
## dim: 58302 1344
## metadata(0):
## assays(1): counts
## rownames(58302): ENSG00000223972 ENSG00000227232 ... ENSG00000277475
## ENSG00000268674
## rowData names(1): Length
## colnames(1344): lane1_3600STDY6077864_ATCACGTT_L001_15986_1
## lane1_3600STDY6077865_CGATGTTT_L001_15986_1 ...
## lane1_3600STDY6184179_TGGTTGAC_L001_17592_1
## lane1_3600STDY6184180_ACCTGCTG_L001_17592_1
## colData names(6): Source Name phenotype ... experiment batch sequencing
## run
## reducedDimNames(0):
## mainExpName: endogenous
## altExpNames(1): ERCC
table(Batch = colData(MessmerESCData)$`experiment batch`)
## Batch
## 1 2 3
## 192 768 384
table(Cluster = colData(MessmerESCData)$phenotype)
## Cluster
## naive primed
## 480 864
MessmerESCData <- logNormCounts(MessmerESCData)
# Subset to top 10% HVG
chosen_features <-
getTopHVGs(modelGeneVar(MessmerESCData, MessmerESCData$`experiment batch`),
n = ceiling(nrow(MessmerESCData) / 10))
MessmerESCData <- MessmerESCData[chosen_features, ]
# Create default named factors for batches and cell clusters
colData(MessmerESCData)$Batch <- factor(colData(MessmerESCData)$`experiment batch`)
colData(MessmerESCData)$Cluster <- factor(colData(MessmerESCData)$phenotype)
MessmerESCData <- runPCA(MessmerESCData)
plotPCA(MessmerESCData, color_by = "Batch")

plotPCA(MessmerESCData, color_by = "Cluster")

MessmerESCData <- runTSNE(MessmerESCData)
plotTSNE(MessmerESCData, color_by = "Batch")

plotTSNE(MessmerESCData, color_by = "Cluster")

MessmerESCData <- runUMAP(MessmerESCData)
plotUMAP(MessmerESCData, color_by = "Batch")

plotUMAP(MessmerESCData, color_by = "Cluster")

Paul HSC Data
PaulHSCData <- PaulHSCData()
PaulHSCData
## class: SingleCellExperiment
## dim: 25188 10368
## metadata(0):
## assays(1): counts
## rownames(25188): 0610007L01Rik 0610007P14Rik ... mt-Nd4 rp9
## rowData names(0):
## colnames(10368): W29953 W29954 ... W76335 W76336
## colData names(13): Well_ID Seq_batch_ID ... CD34_measurement
## FcgR3_measurement
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
table(Batch = colData(PaulHSCData)$Seq_batch_ID)
## Batch
## SB17 SB19 SB20 SB23 SB25 SB29
## 1152 2304 768 2304 1536 2304
table(Cluster = colData(PaulHSCData)$Batch_desc)
## Cluster
## Cebpa control Cebpa KO Cebpe control
## 1152 2304 384
## Cebpe KO CMP CD41 CMP Flt3+ Csf1r+
## 768 384 1152
## CMP Irf8-GFP+ MHCII+ Unsorted myeloid
## 1152 3072
PaulHSCData <- logNormCounts(PaulHSCData)
# Subset to top 10% HVG
chosen_features <-
getTopHVGs(modelGeneVar(PaulHSCData, PaulHSCData$Seq_batch_ID),
n = ceiling(nrow(PaulHSCData) / 10))
PaulHSCData <- PaulHSCData[chosen_features, ]
# Subset to two smallest batches
PaulHSCData <- PaulHSCData[, colData(PaulHSCData)$Seq_batch_ID %in% c("SB17", "SB20")]
# Create default named factors for batches and cell clusters
colData(PaulHSCData)$Batch <- factor(colData(PaulHSCData)$Seq_batch_ID)
colData(PaulHSCData)$Cluster <- factor(colData(PaulHSCData)$Batch_desc)
PaulHSCData <- runPCA(PaulHSCData)
plotPCA(PaulHSCData, color_by = "Batch")

plotPCA(PaulHSCData, color_by = "Cluster")

PaulHSCData <- runTSNE(PaulHSCData)
plotTSNE(PaulHSCData, color_by = "Batch")

plotTSNE(PaulHSCData, color_by = "Cluster")

PaulHSCData <- runUMAP(PaulHSCData)
plotUMAP(PaulHSCData, color_by = "Batch")

plotUMAP(PaulHSCData, color_by = "Cluster")

Zilionis Lung Data
ZilionisLungData <- ZilionisLungData('mouse')
ZilionisLungData
## class: SingleCellExperiment
## dim: 28205 17549
## metadata(0):
## assays(1): counts
## rownames(28205): 0610007P14Rik 0610009B22Rik ... mt-Nd5 mt-Nd6
## rowData names(0):
## colnames(17549): bc0001 bc0002 ... bc1087 bc1088
## colData names(12): Library Barcode ... Major cell type Minor subset
## reducedDimNames(1): SPRING
## mainExpName: NULL
## altExpNames(0):
table(Batch = colData(ZilionisLungData)$`Library prep batch`)
## Batch
## round1_20151128 round2_20151217 round3_20160313
## 4339 3639 7961
table(Cluster = colData(ZilionisLungData)$`Major cell type`)
## Cluster
## B cells Basophils MoMacDC Neutrophils NK cells pDC
## 2813 34 2397 8022 630 62
## T cells
## 1981
ZilionisLungData <- logNormCounts(ZilionisLungData)
# Subset to top 10% HVG
chosen_features <-
getTopHVGs(
modelGeneVar(ZilionisLungData, ZilionisLungData$`Library prep batch`),
n = ceiling(nrow(ZilionisLungData) / 10)
)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
ZilionisLungData <- ZilionisLungData[chosen_features, ]
# Subset to two smallest batches
ZilionisLungData <- ZilionisLungData[, colData(ZilionisLungData)$`Library prep batch` %in% c("round1_20151128", "round2_20151217")]
# Create default named factors for batches and cell clusters
colData(ZilionisLungData)$Batch <- factor(colData(ZilionisLungData)$`Library prep batch`)
colData(ZilionisLungData)$Cluster <- factor(colData(ZilionisLungData)$`Major cell type`)
ZilionisLungData <- runPCA(ZilionisLungData)
plotPCA(ZilionisLungData, color_by = "Batch")

plotPCA(ZilionisLungData, color_by = "Cluster")

ZilionisLungData <- runTSNE(ZilionisLungData)
plotTSNE(ZilionisLungData, color_by = "Batch")

plotTSNE(ZilionisLungData, color_by = "Cluster")

ZilionisLungData <- runUMAP(ZilionisLungData)
plotUMAP(ZilionisLungData, color_by = "Batch")

plotUMAP(ZilionisLungData, color_by = "Cluster")

Wu Kidney Data
WuKidneyData <- WuKidneyData()
WuKidneyData
## class: SingleCellExperiment
## dim: 18249 17542
## metadata(0):
## assays(1): counts
## rownames(18249): mt-Cytb mt-Nd6 ... Gm44613 Gm38304
## rowData names(0):
## colnames(17542): sCellDropseq_AAAAAGAAGGAC sCellDropseq_AAAACTACCTTC
## ... UUO_TTGCCGTCACAAGACG UUO_TTTGTCATCTGCTGTC
## colData names(2): Technology Status
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
table(Batch = colData(WuKidneyData)$Technology)
## Batch
## DroNcSeq sCellDropseq sNuc-10x sNucDropseq
## 2769 3531 8231 3011
table(Cluster = colData(WuKidneyData)$Status)
## Cluster
## disease healthy
## 6147 11395
WuKidneyData <- logNormCounts(WuKidneyData)
# Subset to top 10% HVG
chosen_features <-
getTopHVGs(modelGeneVar(WuKidneyData, WuKidneyData$Technology),
n = ceiling(nrow(WuKidneyData) / 10))
WuKidneyData <- WuKidneyData[chosen_features, ]
# Subset to two smallest batches
WuKidneyData <- WuKidneyData[, colData(WuKidneyData)$Technology %in% c("sCellDropseq", "sNuc-10x")]
# Create default named factors for batches and cell clusters
colData(WuKidneyData)$Batch <- factor(colData(WuKidneyData)$Technology)
colData(WuKidneyData)$Cluster <- factor(colData(WuKidneyData)$Status)
WuKidneyData <- runPCA(WuKidneyData)
plotPCA(WuKidneyData, color_by = "Batch")

plotPCA(WuKidneyData, color_by = "Cluster")

WuKidneyData <- runTSNE(WuKidneyData)
plotTSNE(WuKidneyData, color_by = "Batch")

plotTSNE(WuKidneyData, color_by = "Cluster")

WuKidneyData <- runUMAP(WuKidneyData)
plotUMAP(WuKidneyData, color_by = "Batch")

plotUMAP(WuKidneyData, color_by = "Cluster")
